/* 
Cleans local union boards information (scraped SD SR files)
		// input: scraped_SD_SR\output_* ; union_list_clean ; estab_union_act
		// output: SD_scraped; SR_scraped; SDSR_gender 
		//			SRSD_scraped; unionboards_SDSR_clean; union_boards_panel_20112019

*/

cap log close
cap log using "$logs/clean_scrapedsdsr", replace 


*****************
*** Import SD ***
*****************

forvalues x = 2/28{
    import excel "$raw\CNES\scraped_SD_SR\output_SD`x'.xlsx", sheet("Unions_info") firstrow clear
	rename A row
	
	// expand all general info to other rows
	local toexpand req_state cnpj duration election_type executives_el voters_win_chapa start_date end_date
	foreach var of local toexpand {
	    replace `var' =  `var'[_n-1] if `var'=="" & row!=0 
	} 
	
	// clean spelling of: req_state, election_type
	replace req_state = strtrim(req_state)
	gen valid_request = (inlist(req_state, "Válida", "VÃ¡lida")) if req_state!=""
	drop req_state
	
	replace election_type = strtrim(election_type)
	gen election_typen = 1 if election_type=="Direta"
	replace election_typen = 2 if inlist(election_type, "Por decisÃ£o de assemblÃ©ia", "Por decisão de assembléia")
	replace election_typen = 3 if inlist(election_type, "Por decisÃ£o do conselho", "Por decisão do conselho")
	label define et 1 "Direct" 2 "Assembly decides" 3 "Council decides"
	label values election_typen et
	drop election_type 
	rename election_typen election_type
	
	// clean dates: end and start date, duration
	split start_date, parse("/")
	destring start_date1, replace
	destring start_date2, replace
	destring start_date3, replace

	gen date_start = mdy(start_date2,start_date1,start_date3)
	
	split end_date, parse("/")
	destring end_date1, replace
	destring end_date2, replace
	destring end_date3, replace
	
	replace end_date3 = 2011 if end_date3==2111

	gen date_end = mdy(end_date2,end_date1,end_date3)
	
	drop end_date* start_date*
	
	// clean CS and RF dummies
	gen CS_d = (CS == "X")
	gen RF_d = (RF == "X")
	drop CS RF
	rename CS_d CS
	rename RF_d RF 
	
	// fix rows that do not contain a member of union board
	replace executives_el= strtrim(executives_el)
	destring executives_el, replace
	bysort file: egen maxrow = max(row)
	drop if strpos(Name, "FEDERACAO")>0
	drop if strpos(Name, "CNPJ")>0
	
	
	// clean role and number of elected members 
	replace Role = strtrim(Role)
	gen president = (Role == "Presidente") if Role !=""
	gen vicepresident = (Role == "Vice-Presidente") if Role !=""
	gen treasurer = (Role == "Tesoureiro") if Role !=""
	gen secgen = (Role == "Secretário Geral") if Role !=""
	gen suplente = strpos(Role, "Suplente")>0 if Role !=""
	
	// dummy for whether has board 
	gen temp = (Name!="" & row!=0)
	bysort file: egen has_board_info = max(temp)
	
	// delete row 0 if has boards data
	drop if row == 0 & row[_n+1]!=0

	drop duration temp
	
	//save 
	tempfile SD`x'
	save `SD`x''
}

	// append 
	use `SD2', clear
	forvalues x = 3/28{
	    append using `SD`x''
	}
	save "$files/SD_scraped.dta", replace
	
	
	// checks
	tab file_found
	tab chapas_n
	tab sindicalizados
	tab voters
	drop row chapas_n sindicalizados file_found voters 
	destring voters_win_chapa, replace
	
	// confirm that it is because this data does not exist
	codebook file 
	preserve
		keep file
		duplicates drop
		gen totab = (file!=file[_n-1]+1)
		tab file if totab == 1
	restore
	
	drop if cnpj == ""
	order cnpj has_board_info date_start date_end Name Role, first 
	save "$files/SD_scraped.dta", replace
	
	
*****************
*** Import SR ***
*****************

forvalues x = 2/6{
    import excel "$raw\CNES\scraped_SD_SR\output_SR`x'.xlsx", sheet("Unions_info") firstrow clear
	rename A row
	
	// expand all general info to other rows
	local toexpand req_state cnpj duration election_type executives_el voters_win_chapa start_date end_date
	foreach var of local toexpand {
	    replace `var' =  `var'[_n-1] if `var'=="" & row!=0 
	} 
	
	// clean spelling of: req_state, election_type
	replace req_state = strtrim(req_state)
	gen valid_request = (inlist(req_state, "Válida", "VÃ¡lida")) if req_state!=""
	drop req_state
	
	replace election_type = strtrim(election_type)
	gen election_typen = 1 if election_type=="Direta"
	replace election_typen = 2 if inlist(election_type, "Por decisÃ£o de assemblÃ©ia", "Por decisão de assembléia")
	replace election_typen = 3 if inlist(election_type, "Por decisÃ£o do conselho", "Por decisão do conselho")
	label define et 1 "Direct" 2 "Assembly decides" 3 "Council decides"
	label values election_typen et
	drop election_type 
	rename election_typen election_type
	
	// clean dates: end and start date, duration
	split start_date, parse("/")
	destring start_date1, replace
	destring start_date2, replace
	destring start_date3, replace

	gen date_start = mdy(start_date2,start_date1,start_date3)
	
	split end_date, parse("/")
	destring end_date1, replace
	destring end_date2, replace
	destring end_date3, replace
	
	replace end_date3 = 2011 if end_date3==2111

	gen date_end = mdy(end_date2,end_date1,end_date3)
	
	drop end_date* start_date*
	
	// clean CS and RF dummies
	gen CS_d = (CS == "X")
	gen RF_d = (RF == "X")
	drop CS RF
	rename CS_d CS
	rename RF_d RF 
	
	// fix rows that do not contain a member of union board
	replace executives_el= strtrim(executives_el)
	destring executives_el, replace
	bysort file: egen maxrow = max(row)
	drop if strpos(Name, "FEDERACAO")>0
	drop if strpos(Name, "CNPJ")>0
	
	// clean role and number of elected members 
	replace Role = strtrim(Role)
	gen president = (Role == "Presidente") if Role !=""
	gen vicepresident = (Role == "Vice-Presidente") if Role !=""
	gen treasurer = (Role == "Tesoureiro") if Role !=""
	gen secgen = (Role == "Secretário Geral") if Role !=""
	gen suplente = strpos(Role, "Suplente")>0 if Role !=""
	
	// dummy for whether has board 
	gen temp = (Name!="" & row!=0)
	bysort file: egen has_board_info = max(temp)
	
	// delete row 0 if has boards data
	drop if row == 0 & row[_n+1]!=0
	
	drop duration temp
	
	//save 
	tempfile SR`x'
	save `SR`x''
}

	// append 
	use `SR2', clear
	forvalues x = 3/6{
	    append using `SR`x''
	}
	save "$files/SR_scraped.dta", replace
	
	// checks
	tab file_found
	tab chapas_n
	tab sindicalizados
	tab voters
	drop row chapas_n sindicalizados file_found voters 
	destring voters_win_chapa, replace
	
	// confirm that it is because this data does not exist
	codebook file
	preserve
		keep file
		duplicates drop
		gen totab = (file!=file[_n-1]+1)
		tab file if totab == 1
	restore
	
	drop if cnpj == ""
	order cnpj has_board_info date_start date_end Name Role, first 
	save "$files/SR_scraped.dta", replace
	
	
******************	
*** Append SD - SR
******************
	use "$files/SR_scraped.dta", clear
	append using "$files/SD_scraped.dta"
	
	// further name cleaning 
	drop if strpos(Name, "ConfederaÃ§Ã£o")>0
	drop if strpos(Name, "Confederação")>0
	drop if strpos(Name, "FederaÃ§Ã£o")>0
	drop if strpos(Name, "Federação")>0
	drop if strpos(Name, "Central Sindical")>0
	
	// add number of members from our record
	bys file: gen members_total = _N
	
	// add gender
		// prepare list of first names for R script
		replace Name =strtrim(Name)
		split Name, parse(" ")
		
		preserve
			keep Name1
			duplicates drop
			export delimited using  "$files/firstnames_SDSR.csv", replace

			/* Running genderBR on firstnames_SDSR generates "$files/genderBR/firstnames_SDSR_g.csv"
				which requires some manual modifications to reduce the number of missing
				gender information for the boards, which is why the file is already provide in
				the replication package
				
				To get the file that we later modified manually, uncomment the next three lines:
				
				local infile "$files/firstnames_SDSR.csv"
				local outfile "$files/genderBR/firstnames_SDSR_g.csv"
				shell "$RPATH" "$CODEPATH/get_gender_fromnames.R" "`infile'" "`outfile'"
		
				*/
			
			import delimited "$files/genderBR/firstnames_SDSR_g.csv", varn(1) clear
			drop v1
			rename name1 Name1
			save "$files/SDSR_gender.dta", replace
		restore
		
		merge m:1 Name1 using "$files/SDSR_gender.dta"
		drop if _m == 2
		drop _m Name1 - Name10
		
		gen female = (gender == "Female") if gender!="NA" & gender!=""
		

	// share female
	bys file: egen sh_female = mean(female)
	
	// female P or VP
	gen female_president = (female == 1 & president==1) if female!=. & Role!=""
	gen female_vicepresident = (female == 1 & vicepresident==1) if female!=. & Role!=""
	gen female_treasurer = (female == 1 & treasurer==1) if female!=. & Role!=""
	gen female_secgen = (female == 1 & secgen==1) if female!=. & Role!=""

	// CUT affiliation
	replace cnpj = strtrim(cnpj)
	
	rename cnpj union_id 
	merge m:1 union_id using "$files/union_list_clean.dta", keepusing(CUT union_type level) keep(1 3)

	save "$files/SRSD_scraped.dta", replace

	
**************
*** Create panel 
***************

use  "$files/SRSD_scraped.dta", clear

gen start_year = year(date_start)
sort union_id start_year file

//keep 2005 - 2019
keep if inrange(start_year,2005, 2019)

// keep one file per unions-year 
preserve
	keep union_id start_year file valid_request
	duplicates drop
	bys union_id start_year: gen numf = _N
	tab numf
	tab valid if numf == 1
	tab valid if numf == 2

	// criterion: if multiple files, keep one with valid request, if not enough keep the lowest filenum
	gen tokeep = 1 if numf == 1
	replace tokeep = 1 if numf >1 & valid == 1
	keep if tokeep ==1 
	drop numf tokeep
	bys union_id start_year: gen numf = _N
	tab numf 
	bys union_id start_year: egen minfile = min(file)
	keep if file == minfile
	drop numf 
	bys union_id start_year: gen numf = _N
	tab numf 
	keep union_id file start_year
	tempfile tokeep
	save `tokeep'
restore

cap drop _merge 
merge m:1 union_id start_year file using `tokeep'
keep if _m == 3 

// female president/vp at union-year level 
bys file: egen temp = max(female_president)
replace female_president = temp
drop temp
bys file: egen temp = max(female_vicepresident)
replace female_vicepresident = temp
drop temp
bys file: egen temp = max(female_treasurer)
replace female_treasurer = temp
drop temp
bys file: egen temp = max(female_secgen)
replace female_secgen = temp
drop temp

// gen size of union board 
bys file: gen board_size = _N
count if executives_el == board_size 

//gen executive board
gen exec = (Role=="Diretor")|(Role=="Membro da Junta Governativa")|(Role=="Membro de Diretoria Colegiada") ///
	|(Role=="Presidente")|(Role=="SecretÃ¡rio Geral")|(Role=="Secretário Geral") ///
	|(Role=="Tesoureiro")|(Role=="Vice-Presidente")
gen exec_fem = (female==1)&(exec==1)
egen xxx = sum(exec), by(file)
egen yyy = sum(exec_fem), by(file)
gen sh_female_exec = yyy/xxx
drop xxx yyy

//gen executive board (w/o directors)
gen execd = (Role=="Membro da Junta Governativa")|(Role=="Membro de Diretoria Colegiada") ///
	|(Role=="Presidente")|(Role=="SecretÃ¡rio Geral")|(Role=="Secretário Geral") ///
	|(Role=="Tesoureiro")|(Role=="Vice-Presidente")
gen execd_fem = (female==1)&(execd==1)
egen xxx = sum(execd), by(file)
egen yyy = sum(execd_fem), by(file)
gen sh_female_execd = yyy/xxx
drop xxx yyy
	
// keep relevant vars at union - year level 
keep union_id start_year level CUT union_type sh_female sh_female_exec sh_female_execd female_president female_vicepresident female_treasurer female_secgen board_size
duplicates drop 
isid union_id start_year

// count how many years we have on average for each union (boards are typically elected for 4 years)
bys union_id: gen countyrs = _N
tab countyrs

// gen bl board size, bl share of female and bl female president, vp, treasurer, secgen (2014)
preserve
	encode union_id, gen(union_id_num)
	xtset union_id_num start_year
	gen exists = 1
	tsfill 
	sort union_id_num start_year
	replace union_id = union_id[_n-1] if exists==. 
	local forbl sh_female sh_female_exec sh_female_execd female_president female_vicepresident female_treasurer female_secgen board_size
	foreach var of local forbl {
		replace `var' = `var'[_n-1] if exists==. 
		gen temp = `var' if start_year == 2014 
		bys union_id_num: egen `var'_bl = max(temp)
		drop temp 
	}
	keep union_id *_bl 
	duplicates drop
	tempfile baselines
	save `baselines'
restore

merge m:1 union_id using `baselines', nogen 
count if sh_female_bl !=. 

order union_id start_year CUT, first
sort union_id start_year 
save "$files/unionboards_SDSR_clean.dta", replace
 

 ****************************************************
 *** Fill-in panel
 ******************************************************

 // list of unions in our sample of CBAs
	use "$files/estab_union_act.dta", clear	 
	keep union_id
	duplicates drop
	tempfile unionlist
	save `unionlist'
	
 // load gender board data and fill in the panel for the boards 
	use "$files/unionboards_SDSR_clean.dta", clear
	bys union_id: egen maxT = max(CUT)
	bys union_id: egen minT = min(CUT)
	assert maxT == minT
	drop maxT minT
	encode union_id, gen(union_id_num)
	rename start_year year
	xtset union_id_num year 
	gen exists = 1
	tsfill, full

	keep sh_female sh_female_exec sh_female_execd female_president female_vicepresident female_treasurer female_secgen board_size union_id_num union_id year exists CUT union_type

	bys union_id_num: egen first_year = min(year) if exists == 1

	local tofill sh_female sh_female_exec sh_female_execd female_president female_vicepresident female_treasurer female_secgen board_size
	foreach var of local tofill {
				replace `var' = `var'[_n-1] if exists==. & year!=2005 
			}
						
	replace union_id = union_id[_n-1] if union_id=="" & year!=2005 
	replace union_id = union_id[_n+1] if union_id=="" & year!=2019 
	replace union_id = union_id[_n-1] if union_id=="" & year!=2005 
	replace union_id = union_id[_n+1] if union_id=="" & year!=2019 
	replace union_id = union_id[_n-1] if union_id=="" & year!=2005 
	replace union_id = union_id[_n+1] if union_id=="" & year!=2019 
	replace union_id = union_id[_n+1] if union_id=="" & year!=2019 
	replace union_id = union_id[_n+1] if union_id=="" & year!=2019 
	replace union_id = union_id[_n+1] if union_id=="" & year!=2019 
	replace union_id = union_id[_n+1] if union_id=="" & year!=2019 
	replace union_id = union_id[_n+1] if union_id=="" & year!=2019 
	replace union_id = union_id[_n+1] if union_id=="" & year!=2019 
	replace union_id = union_id[_n+1] if union_id=="" & year!=2019 
	replace union_id = union_id[_n+1] if union_id=="" & year!=2019 
	replace union_id = union_id[_n+1] if union_id=="" & year!=2019 
	replace union_id = union_id[_n+1] if union_id=="" & year!=2019 
	replace union_id = union_id[_n+1] if union_id=="" & year!=2019 
	replace union_id = union_id[_n+1] if union_id=="" & year!=2019 
	count if union_id==""

	// keep data from 2011 to 2019
	keep if inrange(year, 2011, 2019)

	// tag unions that are in our CBA sample
	cap drop _merge 
	merge m:1 union_id using `unionlist'
	drop if _merge == 2
	gen incba_sample = (_merge == 3)
	drop _merge 

	// stats on unions in cba sample or not
	unique union_id 
	unique union_id if incba_sample==1 

	// treatment defined at union level
	bys union_id_num: egen treat = max(CUT)

	save "$files/union_boards_panel_20112019", replace

cap log close
